{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocess Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:19.287587Z",
     "start_time": "2020-06-23T09:14:19.044368Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "tqdm.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:19.405574Z",
     "start_time": "2020-06-23T09:14:19.399264Z"
    }
   },
   "outputs": [],
   "source": [
    "train_data_path = \"./train_preliminary/\"\n",
    "train_final_data_path = \"./train_semi_final/\"\n",
    "test_data_path = \"./test/\"\n",
    "save_path = './processed_data'\n",
    "if not os.path.exists(save_path):\n",
    "    os.makedirs(save_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:49.428631Z",
     "start_time": "2020-06-23T09:14:19.736820Z"
    }
   },
   "outputs": [],
   "source": [
    "df_ad = pd.concat([pd.read_csv(os.path.join(train_data_path, \"ad.csv\")), pd.read_csv(os.path.join(train_final_data_path, \"ad.csv\")), pd.read_csv(os.path.join(test_data_path, \"ad.csv\"))])\n",
    "\n",
    "df_click = pd.concat([pd.read_csv(os.path.join(train_data_path, \"click_log.csv\")), pd.read_csv(os.path.join(train_final_data_path, \"click_log.csv\")), pd.read_csv(os.path.join(test_data_path, \"click_log.csv\"))])\n",
    "                            \n",
    "df_train_user = pd.concat([pd.read_csv(os.path.join(train_data_path, \"user.csv\")), pd.read_csv(os.path.join(train_final_data_path, \"user.csv\"))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:56.835602Z",
     "start_time": "2020-06-23T09:14:51.514222Z"
    }
   },
   "outputs": [],
   "source": [
    "# 复赛和初赛、训练集和测试集点击的广告有重复部分，去重\n",
    "df_ad = df_ad.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理空值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:57.693595Z",
     "start_time": "2020-06-23T09:14:56.837521Z"
    }
   },
   "outputs": [],
   "source": [
    "df_ad[df_ad==\"\\\\N\"] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:57.994201Z",
     "start_time": "2020-06-23T09:14:57.696027Z"
    }
   },
   "outputs": [],
   "source": [
    "df_ad.fillna(0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:58.880662Z",
     "start_time": "2020-06-23T09:14:57.996021Z"
    }
   },
   "outputs": [],
   "source": [
    "df_ad = df_ad.astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:14:59.158436Z",
     "start_time": "2020-06-23T09:14:58.882481Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "creative_id               0\n",
       "ad_id                     0\n",
       "product_id          1575509\n",
       "product_category          0\n",
       "advertiser_id             0\n",
       "industry             183159\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(df_ad == 0).sum() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:15:01.094494Z",
     "start_time": "2020-06-23T09:15:01.028618Z"
    }
   },
   "outputs": [],
   "source": [
    "# 缺失值最终填充为最大值，0留给pad\n",
    "df_ad.loc[df_ad['product_id']==0, 'product_id'] = df_ad['product_id'].max() + 1\n",
    "df_ad.loc[df_ad['industry']==0, 'industry'] = df_ad['industry'].max() + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 将广告的信息合并到用户点击数据表中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:15:40.259129Z",
     "start_time": "2020-06-23T09:15:02.212605Z"
    }
   },
   "outputs": [],
   "source": [
    "df_click = df_click.merge(df_ad, on='creative_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 排序，确保后期分组时仍然按时间有序\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:17:33.078845Z",
     "start_time": "2020-06-23T09:15:40.261019Z"
    }
   },
   "outputs": [],
   "source": [
    "df_click.sort_values(by=['user_id', 'time'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:17:34.960089Z",
     "start_time": "2020-06-23T09:17:33.081124Z"
    }
   },
   "outputs": [],
   "source": [
    "df_click = df_click.astype(np.int32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:17:34.975674Z",
     "start_time": "2020-06-23T09:17:34.962094Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>time</th>\n",
       "      <th>user_id</th>\n",
       "      <th>creative_id</th>\n",
       "      <th>click_times</th>\n",
       "      <th>ad_id</th>\n",
       "      <th>product_id</th>\n",
       "      <th>product_category</th>\n",
       "      <th>advertiser_id</th>\n",
       "      <th>industry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>71892857</th>\n",
       "      <td>20</td>\n",
       "      <td>1</td>\n",
       "      <td>877468</td>\n",
       "      <td>1</td>\n",
       "      <td>773445</td>\n",
       "      <td>44315</td>\n",
       "      <td>5</td>\n",
       "      <td>29455</td>\n",
       "      <td>106</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101809561</th>\n",
       "      <td>20</td>\n",
       "      <td>1</td>\n",
       "      <td>209778</td>\n",
       "      <td>1</td>\n",
       "      <td>188507</td>\n",
       "      <td>136</td>\n",
       "      <td>2</td>\n",
       "      <td>9702</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117324299</th>\n",
       "      <td>20</td>\n",
       "      <td>1</td>\n",
       "      <td>821396</td>\n",
       "      <td>1</td>\n",
       "      <td>724607</td>\n",
       "      <td>44315</td>\n",
       "      <td>5</td>\n",
       "      <td>7293</td>\n",
       "      <td>326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99387603</th>\n",
       "      <td>39</td>\n",
       "      <td>1</td>\n",
       "      <td>1683713</td>\n",
       "      <td>1</td>\n",
       "      <td>1458878</td>\n",
       "      <td>44315</td>\n",
       "      <td>5</td>\n",
       "      <td>14668</td>\n",
       "      <td>326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42203506</th>\n",
       "      <td>40</td>\n",
       "      <td>1</td>\n",
       "      <td>122032</td>\n",
       "      <td>1</td>\n",
       "      <td>109959</td>\n",
       "      <td>1334</td>\n",
       "      <td>2</td>\n",
       "      <td>11411</td>\n",
       "      <td>336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44609273</th>\n",
       "      <td>75</td>\n",
       "      <td>4000000</td>\n",
       "      <td>3596158</td>\n",
       "      <td>1</td>\n",
       "      <td>3096233</td>\n",
       "      <td>44315</td>\n",
       "      <td>18</td>\n",
       "      <td>36668</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115480909</th>\n",
       "      <td>75</td>\n",
       "      <td>4000000</td>\n",
       "      <td>3642395</td>\n",
       "      <td>1</td>\n",
       "      <td>3135640</td>\n",
       "      <td>1076</td>\n",
       "      <td>2</td>\n",
       "      <td>18422</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12280290</th>\n",
       "      <td>76</td>\n",
       "      <td>4000000</td>\n",
       "      <td>366858</td>\n",
       "      <td>1</td>\n",
       "      <td>331268</td>\n",
       "      <td>1036</td>\n",
       "      <td>2</td>\n",
       "      <td>36890</td>\n",
       "      <td>319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51542431</th>\n",
       "      <td>76</td>\n",
       "      <td>4000000</td>\n",
       "      <td>3333680</td>\n",
       "      <td>1</td>\n",
       "      <td>2868147</td>\n",
       "      <td>1469</td>\n",
       "      <td>2</td>\n",
       "      <td>32830</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95070891</th>\n",
       "      <td>77</td>\n",
       "      <td>4000000</td>\n",
       "      <td>3697105</td>\n",
       "      <td>1</td>\n",
       "      <td>3181227</td>\n",
       "      <td>107</td>\n",
       "      <td>2</td>\n",
       "      <td>52421</td>\n",
       "      <td>317</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>133878445 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           time  user_id  creative_id  click_times    ad_id  product_id  \\\n",
       "71892857     20        1       877468            1   773445       44315   \n",
       "101809561    20        1       209778            1   188507         136   \n",
       "117324299    20        1       821396            1   724607       44315   \n",
       "99387603     39        1      1683713            1  1458878       44315   \n",
       "42203506     40        1       122032            1   109959        1334   \n",
       "...         ...      ...          ...          ...      ...         ...   \n",
       "44609273     75  4000000      3596158            1  3096233       44315   \n",
       "115480909    75  4000000      3642395            1  3135640        1076   \n",
       "12280290     76  4000000       366858            1   331268        1036   \n",
       "51542431     76  4000000      3333680            1  2868147        1469   \n",
       "95070891     77  4000000      3697105            1  3181227         107   \n",
       "\n",
       "           product_category  advertiser_id  industry  \n",
       "71892857                  5          29455       106  \n",
       "101809561                 2           9702         6  \n",
       "117324299                 5           7293       326  \n",
       "99387603                  5          14668       326  \n",
       "42203506                  2          11411       336  \n",
       "...                     ...            ...       ...  \n",
       "44609273                 18          36668        36  \n",
       "115480909                 2          18422        45  \n",
       "12280290                  2          36890       319  \n",
       "51542431                  2          32830        21  \n",
       "95070891                  2          52421       317  \n",
       "\n",
       "[133878445 rows x 9 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_click"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 按user_id分组处理数据，将每个用户的点击序列聚集到一起"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:17:35.004590Z",
     "start_time": "2020-06-23T09:17:34.977189Z"
    }
   },
   "outputs": [],
   "source": [
    "def process_group(df):\n",
    "    dic = {}\n",
    "    for name in ['time', 'creative_id', 'click_times', 'ad_id', 'product_id', 'product_category', 'advertiser_id', 'industry']:\n",
    "        dic[name] = df[name].values\n",
    "    return pd.Series(dic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:55:36.971060Z",
     "start_time": "2020-06-23T09:17:35.008056Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4000000/4000000 [37:58<00:00, 1755.39it/s] \n"
     ]
    }
   ],
   "source": [
    "df_click_group = df_click.groupby('user_id').progress_apply(process_group)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 将年龄和性别标签合并到df_train中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T09:55:39.259780Z",
     "start_time": "2020-06-23T09:55:36.981737Z"
    }
   },
   "outputs": [],
   "source": [
    "df_click_group = df_click_group.join(df_train_user.set_index('user_id'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存成pickle文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T10:12:11.170617Z",
     "start_time": "2020-06-23T10:12:11.159485Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "time                 object\n",
       "creative_id          object\n",
       "click_times          object\n",
       "ad_id                object\n",
       "product_id           object\n",
       "product_category     object\n",
       "advertiser_id        object\n",
       "industry             object\n",
       "age                 float64\n",
       "gender              float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_click_group.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-23T11:24:22.800804Z",
     "start_time": "2020-06-23T11:19:23.235831Z"
    }
   },
   "outputs": [],
   "source": [
    "df_click_group.to_pickle(os.path.join(save_path, 'processed_data_numerical.pkl'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
